FOODHUB
OPERATIONS OF FOODHUB APP
OBJECTIVE
DATA DICTIONARY
#Importing the necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
%matplotlib inline
# mounting to drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Reading a data file
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/foodhub_order.csv')
# checking first five rows of the dataset
df.head()
| order_id | customer_id | restaurant_name | cuisine_type | cost_of_the_order | day_of_the_week | rating | food_preparation_time | delivery_time | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1477147 | 337525 | Hangawi | Korean | 30.75 | Weekend | Not given | 25 | 20 |
| 1 | 1477685 | 358141 | Blue Ribbon Sushi Izakaya | Japanese | 12.08 | Weekend | Not given | 25 | 23 |
| 2 | 1477070 | 66393 | Cafe Habana | Mexican | 12.23 | Weekday | 5 | 23 | 28 |
| 3 | 1477334 | 106968 | Blue Ribbon Fried Chicken | American | 29.20 | Weekend | 3 | 25 | 15 |
| 4 | 1478249 | 76942 | Dirty Bird to Go | American | 11.59 | Weekday | 4 | 25 | 24 |
#checking last five rows of the dataset
df.tail()
| order_id | customer_id | restaurant_name | cuisine_type | cost_of_the_order | day_of_the_week | rating | food_preparation_time | delivery_time | |
|---|---|---|---|---|---|---|---|---|---|
| 1893 | 1476701 | 292602 | Chipotle Mexican Grill $1.99 Delivery | Mexican | 22.31 | Weekend | 5 | 31 | 17 |
| 1894 | 1477421 | 397537 | The Smile | American | 12.18 | Weekend | 5 | 31 | 19 |
| 1895 | 1477819 | 35309 | Blue Ribbon Sushi | Japanese | 25.22 | Weekday | Not given | 31 | 24 |
| 1896 | 1477513 | 64151 | Jack's Wife Freda | Mediterranean | 12.18 | Weekday | 5 | 23 | 31 |
| 1897 | 1478056 | 120353 | Blue Ribbon Sushi | Japanese | 19.45 | Weekend | Not given | 28 | 24 |
# Understanding the shape of data set---no.of rows vs no.columns
df.shape
(1898, 9)
OBSERVATION :
What are the datatypes of the different columns in the dataset? [0.5 mark]
# Analyzing the data types present in the data set
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1898 entries, 0 to 1897 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 1898 non-null int64 1 customer_id 1898 non-null int64 2 restaurant_name 1898 non-null object 3 cuisine_type 1898 non-null object 4 cost_of_the_order 1898 non-null float64 5 day_of_the_week 1898 non-null object 6 rating 1898 non-null object 7 food_preparation_time 1898 non-null int64 8 delivery_time 1898 non-null int64 dtypes: float64(1), int64(4), object(4) memory usage: 133.6+ KB
OBSERVATION :
# Order and customer ids are labels, not quantities, so store them as strings.
for id_column in ('order_id', 'customer_id'):
    df[id_column] = df[id_column].astype(str)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1898 entries, 0 to 1897 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 1898 non-null object 1 customer_id 1898 non-null object 2 restaurant_name 1898 non-null object 3 cuisine_type 1898 non-null object 4 cost_of_the_order 1898 non-null float64 5 day_of_the_week 1898 non-null object 6 rating 1898 non-null object 7 food_preparation_time 1898 non-null int64 8 delivery_time 1898 non-null int64 dtypes: float64(1), int64(2), object(6) memory usage: 133.6+ KB
OBSERVATION :
# Treat the placeholder 'Not given' as a missing rating, then cast the column to
# 'Int64' -- the pandas nullable integer dtype, which can hold missing values.
rating_without_placeholder = df['rating'].replace({'Not given': np.nan})
df['rating'] = rating_without_placeholder.astype('Int64')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1898 entries, 0 to 1897 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 1898 non-null object 1 customer_id 1898 non-null object 2 restaurant_name 1898 non-null object 3 cuisine_type 1898 non-null object 4 cost_of_the_order 1898 non-null float64 5 day_of_the_week 1898 non-null object 6 rating 1162 non-null Int64 7 food_preparation_time 1898 non-null int64 8 delivery_time 1898 non-null int64 dtypes: Int64(1), float64(1), int64(2), object(5) memory usage: 135.4+ KB
OBSERVATION :
# New info of datatype
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1898 entries, 0 to 1897 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 1898 non-null object 1 customer_id 1898 non-null object 2 restaurant_name 1898 non-null object 3 cuisine_type 1898 non-null object 4 cost_of_the_order 1898 non-null float64 5 day_of_the_week 1898 non-null object 6 rating 1162 non-null Int64 7 food_preparation_time 1898 non-null int64 8 delivery_time 1898 non-null int64 dtypes: Int64(1), float64(1), int64(2), object(5) memory usage: 135.4+ KB
OBSERVATION :
Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]
# checking if any duplicate data is present in the dataset
df.duplicated().sum()
0
OBSERVATION :
# checking if missing data is present with in the datset
df.isnull().sum()
order_id 0 customer_id 0 restaurant_name 0 cuisine_type 0 cost_of_the_order 0 day_of_the_week 0 rating 736 food_preparation_time 0 delivery_time 0 dtype: int64
OBSERVATION :
How many orders are not rated? [1 mark]
# Checking the percentage of data missing in the 'rating' column.
(df['rating'].isnull().sum()/len(df))*100
38.77766069546891
OBSERVATION :
#unique values which are present in 'rating' column
df['rating'].unique()
<IntegerArray> [<NA>, 5, 3, 4] Length: 4, dtype: Int64
# Group by 'cuisine_type'
grouped = df.groupby('cuisine_type')
# Function to calculate mode
def mode(substitute):
    """Return the most frequent non-missing value of a Series.

    Parameters
    ----------
    substitute : pandas.Series
        The values of one group.

    Returns
    -------
    scalar
        The first value returned by ``substitute.mode()``, or ``pd.NA`` when
        that result is empty (no non-missing values).
    """
    modes = substitute.mode()
    if modes.empty:
        # Return a scalar missing marker; returning the input Series here
        # would make the per-group result a nested object instead of a value.
        return pd.NA
    return modes.iloc[0]
# Most frequent rating of every cuisine (one value per cuisine_type).
rating_cuisine_with_mode = grouped['rating'].apply(mode)
# Look up each row's cuisine in that table. map() keeps df's own index, so the
# fill below aligns row by row even when df is not indexed 0..n-1 (the earlier
# .values + pd.Series(...) approach aligned purely by position).
rating_cuisine_with_mode = df['cuisine_type'].map(rating_cuisine_with_mode).astype('Int64')
# Fill only the missing ratings and keep the nullable integer dtype
# (combine_first with a plain Series upcast the column to Float64).
df['rating'] = df['rating'].fillna(rating_cuisine_with_mode).astype('Int64')
# Verifying the 'rating' column after filling with mode values
df['rating'].isnull().sum()
0
# Verifying the first 5 entries which had null values earlier
df.head()
| order_id | customer_id | restaurant_name | cuisine_type | cost_of_the_order | day_of_the_week | rating | food_preparation_time | delivery_time | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1477147 | 337525 | Hangawi | Korean | 30.75 | Weekend | 4 | 25 | 20 |
| 1 | 1477685 | 358141 | Blue Ribbon Sushi Izakaya | Japanese | 12.08 | Weekend | 5 | 25 | 23 |
| 2 | 1477070 | 66393 | Cafe Habana | Mexican | 12.23 | Weekday | 5 | 23 | 28 |
| 3 | 1477334 | 106968 | Blue Ribbon Fried Chicken | American | 29.20 | Weekend | 3 | 25 | 15 |
| 4 | 1478249 | 76942 | Dirty Bird to Go | American | 11.59 | Weekday | 4 | 25 | 24 |
df['rating'].describe()
count 1898.0 mean 4.584299 std 0.665836 min 3.0 25% 4.0 50% 5.0 75% 5.0 max 5.0 Name: rating, dtype: Float64
INSIGHT :
Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]
# checking the statistical summary of the dataset
# T - Transpose for easy readability
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| cost_of_the_order | 1898.0 | 16.498851 | 7.483812 | 4.47 | 12.08 | 14.14 | 22.2975 | 35.41 |
| rating | 1898.0 | 4.584299 | 0.665836 | 3.0 | 4.0 | 5.0 | 5.0 | 5.0 |
| food_preparation_time | 1898.0 | 27.37197 | 4.632481 | 20.0 | 23.0 | 27.0 | 31.0 | 35.0 |
| delivery_time | 1898.0 | 24.161749 | 4.972637 | 15.0 | 20.0 | 25.0 | 28.0 | 33.0 |
OBSERVATION :
Minimum cost of order and maximum cost of order having a vast difference might be because of
df['food_preparation_time'].describe()
count 1898.000000 mean 27.371970 std 4.632481 min 20.000000 25% 23.000000 50% 27.000000 75% 31.000000 max 35.000000 Name: food_preparation_time, dtype: float64
# Fastest, average (rounded to a whole minute) and slowest food preparation time.
prep_minutes = df['food_preparation_time']
minimum_time = prep_minutes.min()
average_time = round(prep_minutes.mean())
maximum_time = prep_minutes.max()
print('Time Taken For Food Preparation')
for label, minutes in (
    ("Minimum :", minimum_time),
    ("Average :", average_time),
    ("Maximum :", maximum_time),
):
    print(label, minutes, "minutes")
Time Taken For Food Preparation Minimum : 20 minutes Average : 27 minutes Maximum : 35 minutes
INSIGHT
# Creating a column 'customer_wait_time': food preparation time plus delivery time
df['customer_wait_time'] = df['food_preparation_time'] + df['delivery_time']
df.head()
| order_id | customer_id | restaurant_name | cuisine_type | cost_of_the_order | day_of_the_week | rating | food_preparation_time | delivery_time | customer_wait_time | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1477147 | 337525 | Hangawi | Korean | 30.75 | Weekend | 4 | 25 | 20 | 45 |
| 1 | 1477685 | 358141 | Blue Ribbon Sushi Izakaya | Japanese | 12.08 | Weekend | 5 | 25 | 23 | 48 |
| 2 | 1477070 | 66393 | Cafe Habana | Mexican | 12.23 | Weekday | 5 | 23 | 28 | 51 |
| 3 | 1477334 | 106968 | Blue Ribbon Fried Chicken | American | 29.20 | Weekend | 3 | 25 | 15 | 40 |
| 4 | 1478249 | 76942 | Dirty Bird to Go | American | 11.59 | Weekday | 4 | 25 | 24 | 49 |
df['customer_wait_time'].describe()
count 1898.000000 mean 51.533720 std 6.833603 min 35.000000 25% 47.000000 50% 52.000000 75% 56.000000 max 68.000000 Name: customer_wait_time, dtype: float64
OBSERVATION :
This column is the total time between the restaurant accepting the order and the order reaching the customer,
i.e. the total time the customer waits to receive the order.
Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]
cost_of_the_order
food_preparation_time
TARGET
# Histogram of the ratings with a kernel density estimate (kde=True) overlaid.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(data=df, x="rating", kde=True, ax=ax)
ax.set_title('Distribution of Rating')
ax.set_xlabel('Rating')
plt.show()
Insight :
# Bar chart with the number of orders for each rating value.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df, x='rating', ax=ax)
ax.set(title='Distribution count of Rating', xlabel='Rating')
plt.show()
Insight :
TARGET
# Box plot summarising the spread of the order cost.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(x="cost_of_the_order", data=df, ax=ax)
ax.set_title('Distribution of order cost')
ax.set_xlabel('Cost of order ($) ')
plt.show()
Insight :
# Violin plot of the order cost: same summary as the box plot plus the density shape.
fig, ax = plt.subplots(figsize=(7, 5))
sns.violinplot(x="cost_of_the_order", data=df, ax=ax)
ax.set(title='Distribution of order cost', xlabel='Cost of order in $ ')
plt.show()
Insight :
TARGET
# Number of orders for each food preparation time (minutes).
ax = sns.countplot(x='food_preparation_time', data=df)
ax.set_title('Distribution count of food preparation time')
ax.set_xlabel('food_preparation_time in minutes')
plt.show()
Insight :
# Box plot summarising the spread of the food preparation time.
ax = sns.boxplot(x='food_preparation_time', data=df)
ax.set_title('Distribution of food preparation time')
ax.set_xlabel('food_preparation_time in minutes');
Insight :
TARGET
# Number of orders for each delivery time (minutes).
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='delivery_time', data=df, ax=ax)
ax.set_title('Distribution count of delivery time')
ax.set_xlabel('delivery_time in minutes')
plt.show()
Insight :
# Box plot summarising the spread of the delivery time.
ax = sns.boxplot(x='delivery_time', data=df)
ax.set(title='Distribution of delivery time', xlabel='delivery_time in minutes')
plt.show()
Insight :
# Number of orders for each total customer wait time (minutes).
ax = sns.countplot(x='customer_wait_time', data=df)
ax.set_title('Distribution count of customer wait time')
ax.set_xlabel('customer_wait_time in minutes')
plt.xticks(rotation=90)  # vertical tick labels so the many distinct values stay legible
plt.show()
Insight
# Box plot summarising the spread of the total customer wait time.
ax = sns.boxplot(x='customer_wait_time', data=df)
ax.set_xlabel('customer_wait_time in minutes')
ax.set_title('Distribution of customer wait time')
plt.xticks(rotation=90)
plt.show()
Insight
order_id
cuisine_type
# Total no.of unique ids alotted to the customers
df['customer_id'].nunique()
1200
# No.of orders placed with each customer_id
df['customer_id'].value_counts()
customer_id
52832 13
47440 10
83287 9
250494 8
259341 7
..
385426 1
254913 1
289597 1
74412 1
397537 1
Name: count, Length: 1200, dtype: int64
Insight :
df['order_id'].nunique()
1898
Insight :
TARGET
# Number of orders per cuisine type.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x="cuisine_type", ax=ax)
ax.set_title("Distribution count of orders made for particular cuisine")
ax.set_xlabel("Type of cuisine")
plt.xticks(rotation=90)  # cuisine names overlap when drawn horizontally
plt.show()
Insight
TARGET
# Number of orders placed on weekdays versus weekends.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x="day_of_the_week", color='aqua', ax=ax)
ax.set(
    title="Frequency of orders made for day_of_the_week",
    xlabel="Weekday vs Weekend",
    ylabel="Count of Orders",
)
plt.xticks(rotation=90)
plt.show()
# Share of all orders placed on weekends and on weekdays, as whole percentages.
total_orders = len(df)
day_of_order = df['day_of_the_week']

weekend_orders = df[day_of_order == 'Weekend']
percentage_weekend_orders = (len(weekend_orders) / total_orders) * 100
print("Percentage of orders on weekends:", round(percentage_weekend_orders), "%")

weekday_orders = df[day_of_order == 'Weekday']
percentage_weekday_orders = (len(weekday_orders) / total_orders) * 100
print("Percentage of orders on weekdays:", round(percentage_weekday_orders), "%")
Percentage of orders on weekends: 71 % Percentage of orders on weekdays: 29 %
Insight
TARGET
# number of unique values in the 'restaurant_name' column of the DataFrame
df['restaurant_name'].nunique()
178
There are 178 registered restaurants, i.e. restaurants that receive orders through the FoodHub app. These restaurants are, in a way, clients of the FoodHub app.
# count of how many times each value occurs.
df['restaurant_name'].value_counts()
restaurant_name
Shake Shack 219
The Meatball Shop 132
Blue Ribbon Sushi 119
Blue Ribbon Fried Chicken 96
Parm 68
...
Sushi Choshi 1
Dos Caminos Soho 1
La Follia 1
Philippe Chow 1
'wichcraft 1
Name: count, Length: 178, dtype: int64
INSIGHT
# Number of orders received by every restaurant (very wide figure: one bar per restaurant).
fig, ax = plt.subplots(figsize=(70, 30))
sns.countplot(data=df, x="restaurant_name", ax=ax)
ax.set_title(" Count of orders made in each restaurant")
ax.set_xlabel("Restaurant Name")
ax.set_ylabel("Count")
plt.xticks(rotation=90)
plt.show()
Insight
Which are the top 5 restaurants in terms of the number of orders received? [1 mark]
#the top 5 restaurants in terms of the number of orders received
df['restaurant_name'].value_counts().head(5)
restaurant_name Shake Shack 219 The Meatball Shop 132 Blue Ribbon Sushi 119 Blue Ribbon Fried Chicken 96 Parm 68 Name: count, dtype: int64
Insight
Top five restaurants
Which is the most popular cuisine on weekends? [1 mark]
# Orders per cuisine, with separate bars for weekend and weekday orders.
ax = sns.countplot(data=df, x='cuisine_type', hue='day_of_the_week')
plt.xticks(rotation=90)  # vertical labels for readability
ax.set(
    title='Popularity of Cuisines on Weekends vs Weekdays',
    xlabel='Cuisine Type',
    ylabel='Count of Orders',
)
plt.show()
Insight
What percentage of the orders cost more than 20 dollars? [2 marks]
# Share of orders costing more than 20 dollars, as a percentage rounded to 2 decimals.
is_above_20 = df['cost_of_the_order'] > 20
total_no_of_orders = len(df)
orders_above_20 = int(is_above_20.sum())  # True counts as 1, so the sum is the number of matches
percentage = (orders_above_20 / total_no_of_orders) * 100
print('Percentage of orders with cost more than 20$ = ', round(percentage, 2), "%")
Percentage of orders with cost more than 20$ = 29.24 %
Insights
What is the mean order delivery time? [1 mark]
# checking the average delivery time taken
mean_delivery_time=df['delivery_time'].mean()
print(round( mean_delivery_time ,2 ))
24.16
Insights
The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]
# Orders per customer; value_counts() sorts in descending order, so the first
# three entries are the three most frequent customers.
customer_order = df['customer_id'].value_counts()
top_3_customers = customer_order.iloc[:3]
print(top_3_customers)
customer_id 52832 13 47440 10 83287 9 Name: count, dtype: int64
# Two-column table (customer id, number of orders) for easier reading.
top_customers_for_discount = pd.DataFrame({
    'customer_id': top_3_customers.index,
    'order_count': top_3_customers.values,
})
top_customers_for_discount
| customer_id | order_count | |
|---|---|---|
| 0 | 52832 | 13 |
| 1 | 47440 | 10 |
| 2 | 83287 | 9 |
Insight
Top 3 most frequent customers eligible for 20% discount vouchers
Customer IDs and number of orders placed
Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]
# Pairwise correlations between the numerical columns, drawn as an annotated heatmap.
numeric_columns = ['cost_of_the_order', 'food_preparation_time', 'delivery_time', 'rating']
correlations = df[numeric_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Heatmap of Numerical Variables')
plt.show()
Insight
# Scatter of delivery time against order cost with a fitted regression line.
fig, ax = plt.subplots(figsize=(7, 5))
sns.regplot(data=df, x='cost_of_the_order', y='delivery_time', ax=ax)
ax.set(title='Cost of order vs Delivery time', xlabel='Cost of order', ylabel='Delivery time')
ax.grid(True)
plt.show()
Insight
# Spread of order cost for each distinct customer wait time.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=df, x="customer_wait_time", y="cost_of_the_order", ax=ax)
ax.set_title("Cost of Order vs Customer Wait Time")
ax.set_xlabel("Customer Wait Time")
ax.set_ylabel("Cost of Order")
plt.xticks(rotation=90)
plt.show()
Insight
# Orders per restaurant, with separate bars for weekend and weekday orders.
fig, ax = plt.subplots(figsize=(25, 5))
sns.countplot(data=df, x='restaurant_name', hue='day_of_the_week', ax=ax)
plt.xticks(rotation=90)  # vertical labels for readability
ax.set(
    title='Popularity of Restaurant on Weekends vs Weekdays',
    xlabel='Restaurant Names',
    ylabel='Count of Orders',
)
plt.show()
Insight
# Orders per cuisine, with separate bars for weekend and weekday orders.
ax = sns.countplot(data=df, x='cuisine_type', hue='day_of_the_week')
plt.xticks(rotation=90)  # vertical labels for readability
ax.set_title('Preference of Cuisines on Weekends vs Weekdays')
ax.set_xlabel('Cuisine Type')
ax.set_ylabel('Count of Orders')
plt.show()
Insight
# Spread of order cost within each cuisine type.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x="cuisine_type", y="cost_of_the_order", ax=ax)
ax.set(title="Cost of Order vs Cuisine Type", xlabel="Cuisine Type", ylabel="Cost of Order in $")
plt.xticks(rotation=90)
plt.show()
Insights
# Bar per cuisine type showing seaborn's default aggregate of the rating.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=df, x="cuisine_type", y="rating", ax=ax)
ax.set_title("Rating Vs Cuisine Type")
ax.set_xlabel("Cuisine Type")
ax.set_ylabel("Rating")
plt.xticks(rotation=90)
plt.show()
Insight
# Spread of the total customer wait time on weekdays versus weekends.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="day_of_the_week", y="customer_wait_time", ax=ax)
ax.set(
    title="Customer Wait Time vs Day of the Week",
    xlabel="Day of the Week",
    ylabel="Customer Wait Time",
)
plt.xticks(rotation=90)
plt.show()
Insight
# Spread of food preparation time within each cuisine type.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="cuisine_type", y="food_preparation_time", ax=ax)
ax.set_title("Food Preparation Time vs Cuisine Type")
ax.set_xlabel("Cuisine Type")
ax.set_ylabel("Food Preparation Time")
plt.xticks(rotation=90)
plt.show()
Insight
# Spread of the total customer wait time within each cuisine type.
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(data=df, x="cuisine_type", y="customer_wait_time", ax=ax)
ax.set_title("Cuisine Type vs Customer Wait Time")
ax.set_xlabel("cuisine_type")          # x-axis: cuisine
ax.set_ylabel("Customer Wait Time")    # y-axis: wait time
plt.xticks(rotation=90)
plt.show()
Insight
# Horizontal box plots of food preparation time for weekdays and weekends.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="food_preparation_time", y="day_of_the_week", ax=ax)
ax.set(
    title="food preparation time vs day_of_the_week",
    xlabel="food preparation time",
    ylabel="day of the week",
)
plt.xticks(rotation=90)
plt.show()
Insight
# Spread of delivery time within each cuisine type.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x="cuisine_type", y="delivery_time", ax=ax)
ax.set_title("Delivery time vs Cuisine type")
ax.set_xlabel("cuisine_type")
ax.set_ylabel("delivery_time")
plt.xticks(rotation=90)
plt.show()
Insight
# Spread of delivery time on weekdays versus weekends.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df, x="day_of_the_week", y="delivery_time", ax=ax)
ax.set(title="Delivery time vs Day of the Week", xlabel="Day of the week", ylabel="delivery_time")
plt.xticks(rotation=90)
plt.show()
Insight
cost_of_the_order vs. customer_wait_time vs. cuisine_type
TARGET:
# Scatterplot of customer wait time against order cost, coloured by cuisine type.
plt.figure(figsize=(30, 10))
sns.scatterplot(x="cost_of_the_order", y="customer_wait_time", hue="cuisine_type", data=df)
plt.title("Cost of the Order vs Customer Wait Time vs Cuisine Type")
plt.xlabel("Cost of the Order")
plt.ylabel("Customer Wait Time")
# One legend call: a second plt.legend() would replace the first and drop its title.
# bbox_to_anchor places the legend just outside the axes so it does not hide points.
plt.legend(title="Cuisine Type", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
# The scatterplot of the full data is overplotted, so draw a random subset of the
# REAL orders. (Generating uniformly random numbers instead produces a plot that
# is unrelated to df and cannot show any relationship present in the data.)
top_cuisines = ['American', 'Japanese', 'Italian', 'Chinese', 'Mexican']  # cuisines chosen for this view
top_cuisine_orders = df[df['cuisine_type'].isin(top_cuisines)]
# random_state makes the draw reproducible; n is capped so a small subset cannot raise
sample_df = top_cuisine_orders.sample(n=min(800, len(top_cuisine_orders)), random_state=0)
plt.figure(figsize=(12, 6))
sns.scatterplot(x="cost_of_the_order", y="customer_wait_time", hue="cuisine_type", data=sample_df)
plt.title("Cost of the Order vs Customer Wait Time vs Cuisine Type")
plt.xlabel("Cost of the Order")
plt.ylabel("Customer Wait Time")
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="Cuisine Type", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)  # grid makes point positions easier to read
plt.show()
Insights
rating vs. cost_of_the_order vs. cuisine_type
TARGET
# Scatterplot of rating against order cost, coloured by cuisine type.
plt.figure(figsize=(30, 10))
sns.scatterplot(x="cost_of_the_order", y="rating", hue="cuisine_type", data=df)
plt.title("Cost of the Order vs Rating vs Cuisine Type")
plt.xlabel("Cost of the Order")
plt.ylabel("rating")
plt.xticks(rotation=90)
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="cuisine type", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
Insight
food_preparation_time vs. cost_of_the_order vs. cuisine_type
TARGET
# Scatterplot of order cost against food preparation time, coloured by cuisine type.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df, x="food_preparation_time", y="cost_of_the_order", hue="cuisine_type", ax=ax)
ax.set_title("food_preparation_time vs cost_of_the_order vs cuisine_type")
ax.set_xlabel("food_preparation_time")
ax.set_ylabel("cost_of_the_order")
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')  # legend outside the axes for readability
ax.grid(True)
plt.show()
# The scatterplot of the full data is overplotted, so draw a random subset of the
# REAL orders. (Generating uniformly random numbers instead produces a plot that
# is unrelated to df and cannot show any relationship present in the data.)
top_cuisines = ['American', 'Japanese', 'Italian', 'Chinese', 'Mexican']  # cuisines chosen for this view
top_cuisine_orders = df[df['cuisine_type'].isin(top_cuisines)]
# random_state makes the draw reproducible; n is capped so a small subset cannot raise
sample_df = top_cuisine_orders.sample(n=min(500, len(top_cuisine_orders)), random_state=0)
plt.figure(figsize=(20, 8))
sns.scatterplot(x="cost_of_the_order", y="food_preparation_time", hue="cuisine_type", data=sample_df)
plt.title("Cost of the Order vs food_preparation Time vs Cuisine Type")
plt.xlabel("Cost of the Order")
plt.ylabel("food_preparation Time")
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="Cuisine Type", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)  # grid makes point positions easier to read
plt.show()
Insight
rating vs. food_preparation_time vs. cuisine_type
TARGET
# Scatterplot of rating against food preparation time, coloured by cuisine type.
plt.figure(figsize=(30, 10))
sns.scatterplot(x="food_preparation_time", y="rating", hue="cuisine_type", data=df)
plt.title("food_preparation_time vs Rating vs Cuisine Type")
plt.xlabel("food_preparation_time")  # label previously misspelled as "food_preparation_timer"
plt.ylabel("rating")
plt.xticks(rotation=90)
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="cuisine type", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
Insight
cost_of_the_order vs. food_preparation_time vs. restaurant_name
TARGET
# Scatterplot of food preparation time against order cost, coloured by restaurant.
plt.figure(figsize=(30, 10))
sns.scatterplot(x="cost_of_the_order", y="food_preparation_time", hue="restaurant_name", data=df)
plt.title("Cost of the Order vs food_preparation_time vs restaurant_name")
plt.xlabel("Cost of the Order")
plt.ylabel("food_preparation_time")
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="restaurant_name", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
df['restaurant_name'].value_counts().head(5)
restaurant_name Shake Shack 219 The Meatball Shop 132 Blue Ribbon Sushi 119 Blue Ribbon Fried Chicken 96 Parm 68 Name: count, dtype: int64
# The scatterplot of the full data is overplotted, so draw a random subset of the
# REAL orders of the five restaurants with the most orders. (Generating uniformly
# random numbers instead produces a plot that is unrelated to df.)
top_restaurants = df['restaurant_name'].value_counts().head(5).index
top_restaurant_orders = df[df['restaurant_name'].isin(top_restaurants)]
# random_state makes the draw reproducible; n is capped so a small subset cannot raise
sample_df = top_restaurant_orders.sample(n=min(100, len(top_restaurant_orders)), random_state=0)
plt.figure(figsize=(12, 6))
sns.scatterplot(x="cost_of_the_order", y="food_preparation_time", hue="restaurant_name", data=sample_df)
plt.title("Cost of the Order vs food_preparation_time vs Restaurant Name")
plt.xlabel("Cost of the Order")
plt.ylabel("Food Preparation Time")
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="restaurant_name", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)  # grid makes point positions easier to read
plt.show()
Insight
delivery_time vs. cost_of_the_order vs. restaurant_name
TARGET
# Scatterplot of delivery time against order cost, coloured by restaurant.
plt.figure(figsize=(30, 10))
sns.scatterplot(x="cost_of_the_order", y="delivery_time", hue="restaurant_name", data=df)
plt.title("Cost of the Order vs delivery_time vs restaurant_name")
plt.xlabel("Cost of the Order")
plt.ylabel("delivery_time")
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="restaurant_name", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)
plt.show()
# The scatterplot of the full data is overplotted, so draw a random subset of the
# REAL orders of the five restaurants with the most orders. (Generating uniformly
# random numbers instead produces a plot that is unrelated to df.)
top_restaurants = df['restaurant_name'].value_counts().head(5).index
top_restaurant_orders = df[df['restaurant_name'].isin(top_restaurants)]
# random_state makes the draw reproducible; n is capped so a small subset cannot raise
sample_df = top_restaurant_orders.sample(n=min(100, len(top_restaurant_orders)), random_state=0)
plt.figure(figsize=(12, 6))
sns.scatterplot(x="cost_of_the_order", y="delivery_time", hue="restaurant_name", data=sample_df)
plt.title("Cost of the Order vs delivery_time vs Restaurant Name")
plt.xlabel("Cost of the Order")
plt.ylabel("Delivery Time")
# One legend call: a second plt.legend() would replace the first and drop its title.
plt.legend(title="restaurant_name", bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(True)  # grid makes point positions easier to read
plt.show()
Insight
rating vs. food_preparation_time vs. day_of_the_week
TARGET
# Box plots of food preparation time per rating, split by weekday/weekend.
# catplot is a figure-level function that creates its own figure, so the size is
# set with height/aspect (5 x 2 = 10 wide by 5 high); a preceding plt.figure()
# would only leave an empty extra figure behind.
sns.catplot(x="rating", y="food_preparation_time", hue="day_of_the_week", data=df, kind='box', height=5, aspect=2)
plt.title("Rating vs Food Preparation Time vs Day of the Week")
plt.xlabel("Rating")
plt.ylabel("Food preparation Time")
plt.show()
<Figure size 1000x500 with 0 Axes>
Insight
rating vs. delivery_time vs. day_of_the_week
TARGET:
# Analyzing using boxplot
# sns.catplot is figure-level: it creates its own figure, so a preceding
# plt.figure(...) only produces an empty extra figure. The 10x5 size is
# requested through height/aspect instead (width = height * aspect).
sns.catplot(x="rating", y="delivery_time", hue="day_of_the_week", data=df, kind='box', height=5, aspect=2)
plt.title("Rating vs Delivery Time vs Day of the Week") # heading of the plot
plt.xlabel("Rating") # x-axis label
plt.ylabel("Delivery Time") # y-axis label
plt.show()
<Figure size 1000x500 with 0 Axes>
Insight
The company wants to provide a promotional offer in the advertisement of the restaurants. The condition to get the offer is that the restaurants must have a rating count of more than 50 and the average rating should be greater than 4. Find the restaurants fulfilling the criteria to get the promotional offer. [3 marks]
# Per-restaurant rating statistics: number of ratings and their mean
rating_stats = {'count': ('rating', 'count'), 'average': ('rating', 'mean')}
grouped = df.groupby('restaurant_name').agg(**rating_stats).reset_index()
# Promotional-offer criteria: more than 50 ratings AND an average above 4
has_enough_ratings = grouped['count'] > 50
has_high_average = grouped['average'] > 4
eligible_restaurants = grouped[has_enough_ratings & has_high_average]
print(eligible_restaurants)
restaurant_name count average 20 Blue Ribbon Fried Chicken 96 4.552083 21 Blue Ribbon Sushi 119 4.521008 109 Parm 68 4.5 121 RedFarm Broadway 59 4.474576 122 RedFarm Hudson 55 4.490909 136 Shake Shack 219 4.561644 153 The Meatball Shop 132 4.689394
Insight
RESTAURANT NAME ------------COUNT / AVERAGE
The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]
# calculating the commission charged by the company
def calculate_net_revenue(cost_of_the_order):
    """Return the commission the company earns on a single order.

    Args:
        cost_of_the_order: Cost of the order in dollars.

    Returns:
        25% of the cost for orders above 20 $, 15% of the cost for orders
        above 5 $ (up to and including 20 $), and 0.0 otherwise.
    """
    if cost_of_the_order > 20:
        return cost_of_the_order * 0.25  # 25% on orders costing more than 20 $
    if cost_of_the_order > 5:
        return cost_of_the_order * 0.15  # 15% on orders costing more than 5 $
    # No commission tier applies. Return an explicit 0.0 instead of falling
    # off the end (implicit None), which would turn into NaN in a DataFrame.
    return 0.0
# Commission earned on every order, stored as a new column
df['net_revenue'] = df['cost_of_the_order'].map(calculate_net_revenue)
# Sum of the per-order commissions = revenue generated by the company
company_revenue = df['net_revenue'].sum()
print('Revenue Generated By The Company :', company_revenue, "$")
# calculating the total revenue, which is the sum of the commission charged by the company and the restaurant bill
# Total amount paid by the customer for these orders (ignoring taxes and other charges if any)
def calculate_total_revenue(cost_of_the_order):
    """Return the order cost plus the commission charged on that order.

    Args:
        cost_of_the_order: Cost of the order in dollars.

    Returns:
        The cost plus 25% for orders above 20 $, the cost plus 15% for
        orders above 5 $ (up to and including 20 $), and the bare cost
        otherwise.
    """
    if cost_of_the_order > 20:
        return cost_of_the_order * 0.25 + cost_of_the_order  # 25% on orders costing more than 20 $
    if cost_of_the_order > 5:
        return cost_of_the_order * 0.15 + cost_of_the_order  # 15% on orders costing more than 5 $
    # No commission tier applies: the order still counts at its own cost.
    # Falling off the end (implicit None) would drop it from the total.
    return cost_of_the_order
# Order cost plus commission for every order, stored as a new column
df['total_revenue'] = df['cost_of_the_order'].map(calculate_total_revenue)
# Sum over all orders
total_paid = df['total_revenue'].sum()
print('Total Amount Paid By The Customers :', total_paid, "$")
Revenue Generated By The Company : 6166.303 $ Total Amount Paid By The Customers : 37438.383 $
Insight
The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]
# analysing total delivery time
# customer_wait_time column = food preparation time + delivery time taken.
# Built here explicitly so the cell does not depend on a column created elsewhere.
df['customer_wait_time'] = df['food_preparation_time'] + df['delivery_time']
# Number of orders whose total wait exceeds 60 minutes
number_of_orders_abv_60_minutes = int((df['customer_wait_time'] > 60).sum())
# Share of those orders among all orders, as a percentage
percentage_orders_abv_60_minutes = (number_of_orders_abv_60_minutes / len(df)) * 100
print("Percentage of orders that take more than 60 minutes to get delivered:",round(percentage_orders_abv_60_minutes,2),"%")
Percentage of orders that take more than 60 minutes to get delivered: 10.54 %
Insight
The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]
# Count how many orders fall under each day_of_the_week value
df['day_of_the_week'].value_counts()
day_of_the_week Weekend 1351 Weekday 547 Name: count, dtype: int64
# Boolean masks selecting the orders placed on each kind of day
is_weekend = df['day_of_the_week'] == 'Weekend'
is_weekday = df['day_of_the_week'] == 'Weekday'
# Average delivery time of weekend orders
mean_delivery_time_weekend = df.loc[is_weekend, 'delivery_time'].mean()
print("Mean delivery time on weekend:", mean_delivery_time_weekend, "minutes")
# Average delivery time of weekday orders
mean_delivery_time_weekday = df.loc[is_weekday, 'delivery_time'].mean()
print("Mean delivery time on weekday:", mean_delivery_time_weekday, "minutes")
# Difference between the two means (weekday minus weekend)
mean_variation = mean_delivery_time_weekday - mean_delivery_time_weekend
print("Mean delivery time variation:", mean_variation, "minutes")
Mean delivery time on weekend: 22.4700222057735 minutes Mean delivery time on weekday: 28.340036563071298 minutes Mean delivery time variation: 5.870014357297798 minutes
Insights
What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]
Restaurant names: The most-ordered restaurants should be prioritized so that their demand is maintained. The least-ordered restaurants need attention: the reasons for their low order counts should be investigated, they should be promoted more, and they should be given data on customer order patterns, which might help them improve their menu, order quality, etc.
Customer ids: Customers who order frequently should receive some benefits so that they are retained and remain loyal.
Day of the week: There is a noticeable difference between the number of weekday and weekend orders. Strategies should therefore be planned to maintain the demand on weekends and increase the demand on weekdays.
Cuisine Types: Certain cuisine types are more popular than others, with some being ordered very infrequently. Popular cuisines should be prioritized for strategies with respect to ratings .
Orders: Some restaurants have significantly fewer orders and need attention, while the top restaurants with many orders should be supported with more delivery personnel.
Food preparation time: Longer preparation times may correlate with lower ratings; reducing food preparation time would reduce the customers' waiting time.
Cost of order: The order costs have outliers, indicating either very high or very low order values. The very high values suggest bulk or large orders; such customers should be retained.
THE FOODHUB AGGREGATORS ARE SERVICE PROVIDERS TO BOTH RESTAURANTS REGISTERED ON APP AND CUSTOMERS MAKING ORDERS THROUGH APP.
Recommendations to FOODHUB app
Maintain a user-friendly app interface with constant updates, simplify navigation, and reduce the number of steps needed to complete an order.
Use advanced AI and machine-learning techniques to provide personalized recommendations and deals based on user behavior and preferences, and to understand peak traffic hours, order-frequency hours, etc.
Highlight ongoing promotions,offers and discounts prominently within the app which are exclusive in-app deals
Maintain a sufficient and efficient pool of delivery personnel to meet the demand, specifically on weekends.
Provide real-time tracking, instant customer assistance, extra tips for quick delivery, etc.
Maintain security measures to protect user data and transaction information.
Encourage customers to leave reviews and ratings for restaurants and their orders.Use it to continuously improve app features and service quality.
The company wants to provide a promotional offer in the advertisement of the restaurants. The condition to get the offer is that the restaurants must have a rating count of more than 50 and the average rating should be greater than 4. Find the restaurants fulfilling the criteria to get the promotional offer. [3 marks]
#Importing the necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
%matplotlib inline
# Attach Google Drive so that the dataset stored there can be read
from google.colab import drive
drive.mount('/content/drive')
# Load the orders CSV from the mounted drive
csv_path = '/content/drive/MyDrive/Colab Notebooks/foodhub_order.csv'
df = pd.read_csv(csv_path)
# Print the column names, non-null counts and dtypes
df.info()
# Independent copy for the analysis below, so df itself is not modified
data = df.copy()
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 1898 non-null int64
1 customer_id 1898 non-null int64
2 restaurant_name 1898 non-null object
3 cuisine_type 1898 non-null object
4 cost_of_the_order 1898 non-null float64
5 day_of_the_week 1898 non-null object
6 rating 1898 non-null object
7 food_preparation_time 1898 non-null int64
8 delivery_time 1898 non-null int64
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB
# Drop rows where the rating is 'Not given'.
# The explicit .copy() makes `data` an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
data = data[data['rating'] != 'Not given'].copy()
# Convert 'rating' from strings to numbers so that it can be averaged
data['rating'] = pd.to_numeric(data['rating'])
# Group by restaurant name and calculate the count and mean of ratings
restaurant_ratings = data.groupby('restaurant_name').agg(
    rating_count=('rating', 'count'),
    average_rating=('rating', 'mean')
).reset_index()
# Filter the restaurants fulfilling the criteria:
# more than 50 ratings and an average rating above 4
filtered_restaurants = restaurant_ratings[
    (restaurant_ratings['rating_count'] > 50)
    & (restaurant_ratings['average_rating'] > 4)
]
print(filtered_restaurants)
restaurant_name rating_count average_rating 16 Blue Ribbon Fried Chicken 64 4.328125 17 Blue Ribbon Sushi 73 4.219178 117 Shake Shack 133 4.278195 132 The Meatball Shop 84 4.511905